In [21]:
import pegasus as pg
data=pg.read_input("../Pegasus_072621/sample.main.lowRes.rmDB.rmNA.zarr")
2021-08-02 13:48:17,603 - pegasusio.readwrite - INFO - zarr file '../Pegasus_072621/sample.main.lowRes.rmDB.rmNA.zarr' is loaded.
2021-08-02 13:48:17,604 - pegasusio.readwrite - INFO - Function 'read_input' finished in 59.77s.
In [22]:
pg.qc_metrics(data,min_genes=750,percent_mito=10,max_genes=6000,mito_prefix='MT')
pg.qcviolin(data,plot_type='gene',dpi=75,n_violin_per_panel=40,panel_size=(12,6) )
pg.qcviolin(data,plot_type='mito',dpi=75,n_violin_per_panel=40,panel_size=(12,6) )
In [23]:
pg.filter_data(data)
pg.identify_robust_genes(data)
pg.log_norm(data)
pg.highly_variable_features(data, consider_batch=True,n_top=3000)
2021-08-02 13:48:26,688 - pegasusio.qc_utils - INFO - After filtration, 199617 out of 254321 cell barcodes are kept in UnimodalData object Hg38-rna.
2021-08-02 13:48:26,689 - pegasus.tools.preprocessing - INFO - Function 'filter_data' finished in 2.67s.
2021-08-02 13:48:34,180 - pegasus.tools.preprocessing - INFO - After filtration, 34978/35412 genes are kept. Among 34978 genes, 27574 genes are robust.
2021-08-02 13:48:34,181 - pegasus.tools.preprocessing - INFO - Function 'identify_robust_genes' finished in 7.49s.
2021-08-02 13:48:35,248 - pegasus.tools.preprocessing - WARNING - Rerun log-normalization. Use the raw counts in backup instead.
2021-08-02 13:48:42,118 - pegasus.tools.preprocessing - INFO - Function 'log_norm' finished in 7.94s.
2021-08-02 13:48:43,644 - pegasus.tools.hvf_selection - INFO - Function 'estimate_feature_statistics' finished in 1.52s.
2021-08-02 13:48:43,708 - pegasus.tools.hvf_selection - INFO - 3000 highly variable features have been selected.
2021-08-02 13:48:43,709 - pegasus.tools.hvf_selection - INFO - Function 'highly_variable_features' finished in 1.59s.
In [24]:
pg.pca(data,n_components=65)
pg.regress_out(data, attrs=['n_genes', 'percent_mito'])
2021-08-02 13:49:33,937 - pegasus.tools.preprocessing - INFO - Function 'pca' finished in 50.22s.
2021-08-02 13:49:35,145 - pegasus.tools.preprocessing - INFO - Function 'regress_out' finished in 1.21s.
Out[24]:
'pca_regressed'
In [25]:
harmony_key = pg.run_harmony(data)
pg.neighbors(data,rep=harmony_key)
pg.louvain(data,rep=harmony_key,resolution=2.3)
pg.umap(data,rep=harmony_key)
pg.scatter(data, attrs=['louvain_labels', 'Channel'], basis='umap')
2021-08-02 13:49:35,153 - pegasus.tools.batch_correction - INFO - Start integration using Harmony.
	Initialization is completed.
	Completed 1 / 10 iteration(s).
	Completed 2 / 10 iteration(s).
	Completed 3 / 10 iteration(s).
Reach convergence after 3 iteration(s).
2021-08-02 13:54:13,605 - pegasus.tools.batch_correction - INFO - Function 'run_harmony' finished in 278.45s.
2021-08-02 13:55:13,867 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 60.26s.
2021-08-02 13:55:20,360 - pegasus.tools.nearest_neighbors - INFO - Function 'calculate_affinity_matrix' finished in 6.49s.
2021-08-02 13:55:27,364 - pegasus.tools.graph_operations - INFO - Function 'construct_graph' finished in 7.00s.
2021-08-02 13:57:56,838 - pegasus.tools.clustering - INFO - Louvain clustering is done. Get 41 clusters.
2021-08-02 13:57:57,057 - pegasus.tools.clustering - INFO - Function 'louvain' finished in 156.69s.
2021-08-02 13:57:57,059 - pegasus.tools.nearest_neighbors - INFO - Found cached kNN results, no calculation is required.
2021-08-02 13:57:57,060 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 0.00s.
2021-08-02 13:57:57,121 - pegasus.tools.visualization - INFO - UMAP(dens_frac=0.0, dens_lambda=0.0, min_dist=0.5, random_state=0, verbose=True)
2021-08-02 13:57:57,122 - pegasus.tools.visualization - INFO - Construct fuzzy simplicial set
2021-08-02 13:57:58,149 - pegasus.tools.visualization - INFO - Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
2021-08-02 14:00:51,452 - pegasus.tools.visualization - INFO - Function 'umap' finished in 174.39s.
In [26]:
pg.scatter(data, attrs=['Diagnosis', 'Channel'], basis='umap')
In [27]:
pg.scatter(data, attrs=['louvain_labels'], basis='umap',legend_loc='on data',dpi=200)
In [28]:
# cell composition
import pandas as pd
df_summary=pd.crosstab(data.obs['louvain_labels'], data.obs['Diagnosis'])
df_summary_percent=df_summary/df_summary.sum()*100
A=round(df_summary_percent,2)
A
Out[28]:
Diagnosis ASD CTL
louvain_labels
1 8.89 5.42
2 7.14 7.08
3 7.56 6.08
4 4.66 8.58
5 6.03 5.43
6 5.84 4.88
7 5.05 4.78
8 5.97 2.91
9 2.96 5.49
10 3.90 4.30
11 2.70 4.49
12 3.47 3.74
13 2.84 3.32
14 2.57 3.42
15 2.78 2.98
16 3.55 1.79
17 2.10 2.75
18 2.73 1.81
19 2.38 2.05
20 1.76 2.14
21 2.14 1.61
22 1.30 2.27
23 1.81 1.76
24 1.92 1.32
25 1.15 1.17
26 1.13 0.91
27 1.14 0.77
28 0.95 0.77
29 0.40 1.00
30 0.71 0.57
31 0.31 0.94
32 0.56 0.45
33 0.41 0.44
34 0.10 0.66
35 0.35 0.38
36 0.47 0.26
37 0.02 0.57
38 0.02 0.27
39 0.12 0.16
40 0.05 0.21
41 0.09 0.04
In [29]:
data.obs.to_csv("meta_dim65_res2.3.csv")
Out[29]:
n_genes n_counts Channel BrainRegion Sex Diagnosis Age BA Description IndividualID percent_mito scale Group louvain_labels anno scrublet_score1 doublet_call
barcodekey
0679_BA17-AAACCCAAGAGGTTTA 3615 7289 0679_BA17 Occipital XX CTL 41 BA17 raw Cell barcode by Genes matrix files 10679 1.093122 13.723068 one_group 6 ODC1 0.102901 False
0679_BA17-AAACCCAAGCAATTAG 3558 6708 0679_BA17 Occipital XX CTL 41 BA17 raw Cell barcode by Genes matrix files 10679 0.894478 14.907573 one_group 5 ODC2 0.102901 False
0679_BA17-AAACCCAAGCCGATAG 3944 8908 0679_BA17 Occipital XX CTL 41 BA17 raw Cell barcode by Genes matrix files 10679 0.979737 11.227125 one_group 24 ODC1 0.070102 False
0679_BA17-AAACCCAAGCCTCTGG 4226 11414 0679_BA17 Occipital XX CTL 41 BA17 raw Cell barcode by Genes matrix files 10679 0.718115 8.761938 one_group 24 ODC1 0.040880 False
0679_BA17-AAACCCAAGCGACTGA 3318 6756 0679_BA17 Occipital XX CTL 41 BA17 raw Cell barcode by Genes matrix files 10679 0.969176 14.801658 one_group 2 ODC1 0.082475 False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
WQ9-TTTGACTTCACCTTAT 2036 4008 WQ9 Occipital XY ASD 16 BA17 multi 5302 2.359897 24.950100 one_group 31 AST2 0.041762 False
WQ9-TTTGATCGTACAGAAT 2365 4518 WQ9 Occipital XY ASD 16 BA17 multi 5302 1.792403 22.143490 one_group 1 AST1 0.015538 False
WQ9-TTTGGAGCAAGCGAAC 1860 3143 WQ9 Occipital XY ASD 16 BA17 multi 5302 3.716840 31.826862 one_group 19 END 0.003539 False
WQ9-TTTGGAGGTTCCATTT 758 1037 WQ9 Occipital XY ASD 16 BA17 multi 5302 5.007803 96.432015 one_group 5 ODC1 0.006017 False
WQ9-TTTGGTTCAAACTAGA 1409 2468 WQ9 Occipital XY ASD 16 BA17 multi 5302 3.666517 40.535063 one_group 1 AST1 0.026198 False

199617 rows × 17 columns